import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import homogeneity_score, silhouette_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA, FastICA as ICA
from sklearn.random_projection import GaussianRandomProjection as GRP
from scipy.stats import kurtosis as kurt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, classification_report
# --- Load raw datasets ---
df_health = pd.read_csv('health.csv')
df_titanic = pd.read_csv('titanic_train.csv')
df_titanic_t = pd.read_csv('titanic_test.csv')

# Cabin is mostly missing; drop it, then numerically encode the categorical
# columns on both titanic frames identically.
for frame in (df_titanic, df_titanic_t):
    frame.drop('Cabin', axis=1, inplace=True)
    # Sex -> 'gender' (male=0, female=1)
    frame.loc[frame['Sex'] == 'male', 'gender'] = 0
    frame.loc[frame['Sex'] == 'female', 'gender'] = 1
    # Embarked port -> 'embarked_n' (S=0, C=1, Q=2)
    frame.loc[frame['Embarked'] == 'S', 'embarked_n'] = 0
    frame.loc[frame['Embarked'] == 'C', 'embarked_n'] = 1
    frame.loc[frame['Embarked'] == 'Q', 'embarked_n'] = 2

# Drop rows with missing values and re-index.
df_health_dn = df_health.dropna().reset_index(drop=True)
df_titanic_dn = df_titanic.dropna().reset_index(drop=True)
df_titanic_t_dn = df_titanic_t.dropna().reset_index(drop=True)
# Combine the cleaned titanic train/test rows into one working frame.
df_titanic_f = pd.concat([df_titanic_dn, df_titanic_t_dn], ignore_index=True)
df_health_dn.head()

# Feature matrices: drop labels and non-numeric / identifier columns.
X_health = df_health_dn.drop('outcome', axis=1)
X_titanic = df_titanic_f.drop(['PassengerId', 'Survived', 'Name', 'Sex', 'Ticket', 'Embarked'], axis=1)

# Standardize each dataset to zero mean / unit variance.
scaler_h = StandardScaler()
scaled_health = scaler_h.fit_transform(X_health)
scaler_t = StandardScaler()
scaled_titanic = scaler_t.fit_transform(X_titanic)
def plot_CFmatrix(confusion_matrix):
    """Render a confusion matrix as an annotated seaborn heatmap.

    `confusion_matrix` is a 2-D array of counts (true label on rows,
    prediction on columns, as produced by sklearn's confusion_matrix).
    """
    plt.figure(figsize=(5, 4))
    sns.heatmap(confusion_matrix, annot=True, cmap='Blues', fmt='0.0f')
    plt.title('Confusion Matrix')
    plt.xlabel('Prediction')
    plt.ylabel('True Label')
# Quick notebook-style inspection of the cleaned frames.
df_health_dn
df_titanic_f.head()
# BUG FIX: removed `np.array(scores)[:,1]` — `scores` is only defined by the
# k-means sweep further down, so evaluating it here raised NameError.
# --- K-means on health data: sweep k = 2..15 and score each clustering ---
scores = []
cluster_labels = []
for k in range(2, 16):
    km = KMeans(n_clusters=k)
    km.fit(scaled_health)
    pred = km.predict(scaled_health)
    score = km.score(scaled_health)  # negative inertia
    ho_score = homogeneity_score(df_health_dn['outcome'], pred)
    # BUG FIX: the old `if i > 1` guard was dead code — the loop starts at 2,
    # so silhouette/Calinski-Harabasz are always computable.
    si_score = silhouette_score(scaled_health, pred)
    ca_score = calinski_harabasz_score(scaled_health, pred)
    scores.append([-score, ho_score, si_score, ca_score])
    cluster_labels.append(pred)

# Plot each metric against the number of clusters.
num_clusters = np.arange(2, 16)
score_types = ['Kmeans', 'homogeneity', 'silhouette', 'calinski-harabasz']
for idx, score_type in enumerate(score_types):
    plt.figure(figsize=(8, 5))
    plt.plot(num_clusters, np.array(scores)[:, idx], marker='o')
    plt.title('Kmeans-{}'.format(score_type))
    plt.xlabel('number of clusters')
    plt.ylabel('{} score'.format(score_type))
scores
# Relative feature importance per k-means cluster (k = 2..4) on health data.
health_avg = df_health_dn.drop(['outcome', 'ID'], axis=1).mean()
for n in range(3):  # cluster_labels[n] holds the labels for k = n + 2
    health_scaled_kn = pd.DataFrame(scaled_health, columns=X_health.columns).drop(['ID'], axis=1)
    health_kn = df_health_dn.drop(['outcome', 'ID'], axis=1)
    health_kn['cluster'] = cluster_labels[n]
    health_scaled_kn['cluster'] = cluster_labels[n]
    scaled_avg = health_scaled_kn.groupby(['cluster']).mean()
    cluster_avg = health_kn.groupby(['cluster']).mean()
    # Deviation of each cluster mean from the population mean.
    relative_imp = cluster_avg / health_avg - 1
    # Keep the 5 features whose scaled cluster means vary the most.
    variances = {col: np.var(scaled_avg[col]) for col in scaled_avg.columns}
    selected_columns = sorted(variances, key=variances.get, reverse=True)[:5]
    plt.figure(figsize=(12, 8))
    sns.heatmap(data=relative_imp[selected_columns], annot=True, fmt='.2f',
                cmap='RdYlGn', linewidths=2, square=True)
# --- K-means on titanic data: sweep k = 2..15 and score each clustering ---
scores_t = []
cluster_labels_t = []
for k in range(2, 16):
    km = KMeans(n_clusters=k)
    km.fit(scaled_titanic)
    pred = km.predict(scaled_titanic)
    score = km.score(scaled_titanic)  # negative inertia
    ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
    si_score = silhouette_score(scaled_titanic, pred)
    ca_score = calinski_harabasz_score(scaled_titanic, pred)
    scores_t.append([-score, ho_score, si_score, ca_score])
    cluster_labels_t.append(pred)

# Plot each metric against the number of clusters.
num_clusters = np.arange(2, 16)
score_types = ['Kmeans', 'homogeneity', 'silhouette', 'calinski-harabasz']
for idx, score_type in enumerate(score_types):
    plt.figure(figsize=(8, 5))
    plt.plot(num_clusters, np.array(scores_t)[:, idx], marker='o')
    plt.title('Kmeans-{}'.format(score_type))
    plt.xlabel('number of clusters')
    plt.ylabel('{} score'.format(score_type))
scores_t
# Per-cluster relative feature importance (k = 2..4) on titanic data.
titanic_avg = df_titanic_f.drop('Survived', axis=1).mean()
for n in range(3):  # cluster_labels_t[n] holds the labels for k = n + 2
    titanic_kn = df_titanic_f.drop('Survived', axis=1)
    titanic_kn['cluster'] = cluster_labels_t[n]
    cluster_avg = titanic_kn.groupby(['cluster']).mean()
    # Deviation of each cluster mean from the population mean.
    relative_imp = cluster_avg / titanic_avg - 1
    plt.figure(figsize=(12, 8))
    sns.heatmap(data=relative_imp, annot=True, fmt='.2f',
                cmap='RdYlGn', linewidths=2, square=True)
# --- EM (Gaussian mixture) on health data: sweep 1..15 components ---
scores = []
cluster_labels = []
for k in range(1, 16):
    gm = GaussianMixture(n_components=k)
    gm.fit(scaled_health)
    pred = gm.predict(scaled_health)
    score = gm.score(scaled_health)  # mean per-sample log-likelihood
    aic = gm.aic(scaled_health)
    bic = gm.bic(scaled_health)
    ho_score = homogeneity_score(df_health_dn['outcome'], pred)
    scores.append([-score, aic, bic, ho_score])
    cluster_labels.append(pred)

# Plot each metric against the number of components.
num_clusters = np.arange(1, 16)
score_types = ['EM', 'AIC', 'BIC', 'homogeneity']
for idx, score_type in enumerate(score_types):
    plt.figure(figsize=(8, 5))
    plt.plot(num_clusters, np.array(scores)[:, idx], marker='o')
    plt.title('EM-{}'.format(score_type))
    plt.xlabel('number of clusters')
    plt.ylabel('{} score'.format(score_type))
    plt.xlim(left=0)
scores
# Feature-importance heatmaps for the EM clusterings (2..4 components).
health_avg = df_health_dn.drop(['outcome', 'ID'], axis=1).mean()
for n in range(1, 4):  # cluster_labels[n] holds labels for n + 1 components
    health_scaled_kn = pd.DataFrame(scaled_health, columns=X_health.columns).drop(['ID'], axis=1)
    health_kn = df_health_dn.drop(['outcome', 'ID'], axis=1)
    health_kn['cluster'] = cluster_labels[n]
    health_scaled_kn['cluster'] = cluster_labels[n]
    scaled_avg = health_scaled_kn.groupby(['cluster']).mean()
    cluster_avg = health_kn.groupby(['cluster']).mean()
    # Deviation of each cluster mean from the population mean.
    relative_imp = cluster_avg / health_avg - 1
    # Keep the 5 features whose scaled cluster means vary the most.
    variances = {col: np.var(scaled_avg[col]) for col in scaled_avg.columns}
    selected_columns = sorted(variances, key=variances.get, reverse=True)[:5]
    plt.figure(figsize=(12, 8))
    sns.heatmap(data=relative_imp[selected_columns], annot=True, fmt='.2f',
                cmap='RdYlGn', linewidths=2, square=True)
# --- EM on titanic data: sweep 1..15 components ---
# NOTE(review): this cell reuses `cluster_labels` (overwriting the health EM
# labels); the importance loop below depends on that name, so it is kept.
scores_t = []
cluster_labels = []
for k in range(1, 16):
    gm = GaussianMixture(n_components=k)
    gm.fit(scaled_titanic)
    pred = gm.predict(scaled_titanic)
    score = gm.score(scaled_titanic)  # mean per-sample log-likelihood
    aic = gm.aic(scaled_titanic)
    bic = gm.bic(scaled_titanic)
    ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
    scores_t.append([-score, aic, bic, ho_score])
    cluster_labels.append(pred)

# Plot each metric against the number of components.
num_clusters = np.arange(1, 16)
score_types = ['EM', 'AIC', 'BIC', 'homogeneity']
for idx, score_type in enumerate(score_types):
    plt.figure(figsize=(8, 5))
    plt.plot(num_clusters, np.array(scores_t)[:, idx], marker='o')
    plt.title('EM-{}'.format(score_type))
    plt.xlabel('number of clusters')
    plt.ylabel('{} score'.format(score_type))
    plt.xlim(left=0)
scores_t
# Feature-importance heatmaps for the titanic EM clusterings (2..4 components).
titanic_avg = df_titanic_f.drop('Survived', axis=1).mean()
for n in range(1, 4):  # cluster_labels[n] holds labels for n + 1 components
    titanic_kn = df_titanic_f.drop('Survived', axis=1)
    titanic_kn['cluster'] = cluster_labels[n]
    cluster_avg = titanic_kn.groupby(['cluster']).mean()
    relative_imp = cluster_avg / titanic_avg - 1
    plt.figure(figsize=(12, 8))
    # BUG FIX: groupby('cluster') consumes the key column into the result
    # index, so `relative_imp` has no 'cluster' column and the previous
    # `.drop('cluster', axis=1)` raised KeyError. Plot the frame directly.
    sns.heatmap(data=relative_imp, annot=True, fmt='.2f',
                cmap='RdYlGn', linewidths=2, square=True)
def column_names(i, name):
    """Return numbered column labels [name+'1', ..., name+str(i)]."""
    return [name + str(idx) for idx in range(1, i + 1)]
# --- PCA on health data: explained variance across 40 components ---
pca = PCA(n_components=40, random_state=3)
pca.fit(scaled_health)
varr_cum = np.cumsum(pca.explained_variance_ratio_)

components = np.arange(1, 41)
plt.figure(figsize=(12, 7))
plt.plot(components, varr_cum, marker='o', label='cumulative explained variance ratio')
plt.plot(components, pca.explained_variance_ratio_, marker='o',
         label='explained variance ratio')
plt.legend()
plt.xticks(components)
plt.axhline(y=0.9, linestyle='--', color='r')  # 90% variance threshold
plt.title('explained variance ratio -- health')
plt.ylabel('explained variance ratio')
plt.xlabel('number of components')

# Singular-value ("eigenvalue") spectrum.
plt.figure(figsize=(12, 8))
plt.bar(components, pca.singular_values_)
plt.xticks(components)
plt.title('Eigenvalues -- health')
plt.ylabel('Eigenvalue')
plt.xlabel('number of components')
# --- PCA (32 components) + K-means on health data ---
pca = PCA(n_components=32, random_state=3)
pca.fit(scaled_health)
pca_health = pca.transform(scaled_health)
km = KMeans(n_clusters=2)
km.fit(pca_health)
pred = km.predict(pca_health)
score = km.score(pca_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
si_score = silhouette_score(pca_health, pred)
ca_score = calinski_harabasz_score(pca_health, pred)
print(-score, ho_score, si_score, ca_score)

# BUG FIX: the hand-written column list skipped 'pc20' and ran to 'pc33',
# mislabelling the 32 components; generate the labels instead.
columns = column_names(32, 'pc')
pca_cluster = pd.DataFrame(data=pca_health, columns=columns)
pca_cluster['cluster label'] = pred
pca_cluster['label'] = df_health_dn['outcome']

# First two PCs coloured by k-means cluster.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=pca_cluster[pca_cluster['cluster label'] == 0]['pc1'],
                y=pca_cluster[pca_cluster['cluster label'] == 0]['pc2'],
                label='cluster 0')
# BUG FIX: this series filtered on the real 'label' column instead of the
# cluster assignment, mixing the two plots.
sns.scatterplot(x=pca_cluster[pca_cluster['cluster label'] == 1]['pc1'],
                y=pca_cluster[pca_cluster['cluster label'] == 1]['pc2'],
                label='cluster 1')
plt.title('PCA + cluster pair plot')

# Same scatter coloured by the true outcome.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=pca_cluster[pca_cluster['label'] == 0]['pc1'],
                y=pca_cluster[pca_cluster['label'] == 0]['pc2'],
                label='real 0')
sns.scatterplot(x=pca_cluster[pca_cluster['label'] == 1]['pc1'],
                y=pca_cluster[pca_cluster['label'] == 1]['pc2'],
                label='real 1')
plt.title('PCA + cluster real label')  # fixed typo 'lable'
plt.xlabel('pc1')
plt.ylabel('pc2')

cf_matrix = confusion_matrix(pca_cluster['label'], pca_cluster['cluster label'])
plot_CFmatrix(cf_matrix)
# --- PCA + EM (2 components) on health data ---
gm = GaussianMixture(n_components=2)
gm.fit(pca_health)
pred = gm.predict(pca_health)
score = gm.score(pca_health)
aic = gm.aic(pca_health)
bic = gm.bic(pca_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

pca_cluster['em label'] = pred
plt.figure(figsize=(7, 5))
sns.scatterplot(x=pca_cluster[pca_cluster['em label'] == 0]['pc1'],
                y=pca_cluster[pca_cluster['em label'] == 0]['pc2'],
                label='cluster 0')
# BUG FIX: this series filtered on the real 'label' column instead of the
# EM assignment.
sns.scatterplot(x=pca_cluster[pca_cluster['em label'] == 1]['pc1'],
                y=pca_cluster[pca_cluster['em label'] == 1]['pc2'],
                label='cluster 1')
plt.title('PCA + EM pair plot')
cf_matrix = confusion_matrix(pca_cluster['label'], pca_cluster['em label'])
plot_CFmatrix(cf_matrix)
# --- ICA on health data: mean kurtosis of components vs component count ---
avg_kurs = []
for n_comp in range(1, 41):
    ica = ICA(n_components=n_comp)
    ica.fit(scaled_health)
    avg_kurs.append(np.mean(kurt(ica.components_)))

plt.figure(figsize=(12, 8))
plt.plot(np.arange(1, 41), avg_kurs, marker='o')
plt.title('ICA Kurtoses - health')
plt.xlabel('number of components')
plt.ylabel('Avg Kurtoses')
# --- ICA (31 components) + K-means on health data ---
ica = ICA(n_components=31, max_iter=1000)
ica.fit(scaled_health)
ica_health = ica.transform(scaled_health)
km = KMeans(n_clusters=2)
km.fit(ica_health)
pred = km.predict(ica_health)
score = km.score(ica_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
si_score = silhouette_score(ica_health, pred)
ca_score = calinski_harabasz_score(ica_health, pred)
print(-score, ho_score, si_score, ca_score)

# BUG FIX: the hand-written column list skipped 'ic20', mislabelling
# components 20..31; generate the labels instead.
columns = column_names(31, 'ic')
ica_cluster = pd.DataFrame(data=ica_health, columns=columns)
ica_cluster['cluster label'] = pred
ica_cluster['label'] = df_health_dn['outcome']

# First two ICs coloured by k-means cluster.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=ica_cluster[ica_cluster['cluster label'] == 0]['ic1'],
                y=ica_cluster[ica_cluster['cluster label'] == 0]['ic2'],
                label='cluster 0')
# BUG FIX: this series filtered on the real 'label' column instead of the
# cluster assignment.
sns.scatterplot(x=ica_cluster[ica_cluster['cluster label'] == 1]['ic1'],
                y=ica_cluster[ica_cluster['cluster label'] == 1]['ic2'],
                label='cluster 1')
plt.title('ICA + Kmeans pair plot')

# Same scatter coloured by the true outcome.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=ica_cluster[ica_cluster['label'] == 0]['ic1'],
                y=ica_cluster[ica_cluster['label'] == 0]['ic2'],
                label='real 0')
sns.scatterplot(x=ica_cluster[ica_cluster['label'] == 1]['ic1'],
                y=ica_cluster[ica_cluster['label'] == 1]['ic2'],
                label='real 1')
plt.title('ICA + Kmeans real label')  # fixed typo 'lable'
plt.xlabel('ic1')
plt.ylabel('ic2')
cf_matrix = confusion_matrix(ica_cluster['label'], ica_cluster['cluster label'])
plot_CFmatrix(cf_matrix)
# --- ICA + EM (2 components) on health data ---
gm = GaussianMixture(n_components=2)
gm.fit(ica_health)
pred = gm.predict(ica_health)
score = gm.score(ica_health)
aic = gm.aic(ica_health)
bic = gm.bic(ica_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

ica_cluster['em label'] = pred
plt.figure(figsize=(7, 5))
sns.scatterplot(x=ica_cluster[ica_cluster['em label'] == 0]['ic1'],
                y=ica_cluster[ica_cluster['em label'] == 0]['ic2'],
                label='cluster 0')
# BUG FIX: this series filtered on the real 'label' column instead of the
# EM assignment.
sns.scatterplot(x=ica_cluster[ica_cluster['em label'] == 1]['ic1'],
                y=ica_cluster[ica_cluster['em label'] == 1]['ic2'],
                label='cluster 1')
plt.title('ICA + EM pair plot')

# Same scatter coloured by the true outcome.
plt.figure(figsize=(7, 5))
sns.scatterplot(x=ica_cluster[ica_cluster['label'] == 0]['ic1'],
                y=ica_cluster[ica_cluster['label'] == 0]['ic2'],
                label='real 0')
sns.scatterplot(x=ica_cluster[ica_cluster['label'] == 1]['ic1'],
                y=ica_cluster[ica_cluster['label'] == 1]['ic2'],
                label='real 1')
plt.title('ICA + EM real label')  # fixed typo 'lable'
plt.xlabel('ic1')
plt.ylabel('ic2')
cf_matrix = confusion_matrix(ica_cluster['label'], ica_cluster['em label'])
plot_CFmatrix(cf_matrix)

# Pairwise scatter of the first six ICs, by cluster and by true label.
plt.figure(figsize=(12, 12))
sns.pairplot(ica_cluster[['ic1', 'ic2', 'ic3', 'ic4', 'ic5', 'ic6', 'cluster label']], hue='cluster label')
plt.figure(figsize=(12, 12))
sns.pairplot(ica_cluster[['ic1', 'ic2', 'ic3', 'ic4', 'ic5', 'ic6', 'label']], hue='label')
# reconstruction error
def compute_recon(i, X, random_state=3):
    """Mean squared reconstruction error of a Gaussian random projection.

    Parameters
    ----------
    i : int
        Number of random-projection components.
    X : array-like of shape (n_samples, n_features)
        Data to project and reconstruct.
    random_state : int, default 3
        Seed for the projection matrix. Default matches the original fixed
        seed; pass different values to average over independent projections
        (with the fixed default, repeated calls are identical).
    """
    rp = GRP(n_components=i, random_state=random_state)
    rp.fit(X)
    rp_data = rp.transform(X)
    # Reconstruct via the pseudo-inverse of the projection matrix.
    recon_X = np.dot(rp_data, np.linalg.pinv(rp.components_.T))
    error = np.mean((X - recon_X) ** 2)
    return error
# --- RP reconstruction error on health data, averaged over 10 runs ---
rec_error_comp = []
for n_comp in range(1, 51):
    # BUG FIX: the list was misspelled `rec_erros`, so the append below
    # raised NameError on the first iteration.
    rec_errors = []
    for _ in range(10):
        rec_errors.append(compute_recon(n_comp, scaled_health))
    # NOTE(review): compute_recon uses a fixed random_state, so these 10
    # repetitions are identical; vary the seed for a meaningful average.
    rec_error_comp.append(np.mean(rec_errors))

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, 51), rec_error_comp, marker='o')
plt.title('RP reconstruction error - health')
plt.xlabel('number of components')
plt.ylabel('reconstruction error')
# --- RP (50 components) + K-means on health data ---
rp = GRP(n_components=50, random_state=3)
rp.fit(scaled_health)
rp_health = rp.transform(scaled_health)
km = KMeans(n_clusters=2)
km.fit(rp_health)
pred = km.predict(rp_health)
score = km.score(rp_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
si_score = silhouette_score(rp_health, pred)
ca_score = calinski_harabasz_score(rp_health, pred)
print(-score, ho_score, si_score, ca_score)

columns = column_names(50, 'rp')
rp_cluster = pd.DataFrame(data=rp_health, columns=columns)
rp_cluster['cluster label'] = pred
rp_cluster['label'] = df_health_dn['outcome']

# First two projected dimensions, coloured by cluster assignment.
plt.figure(figsize=(7, 5))
for lbl in (0, 1):
    subset = rp_cluster[rp_cluster['cluster label'] == lbl]
    sns.scatterplot(x=subset['rp1'], y=subset['rp2'], label='cluster {}'.format(lbl))
plt.title('RP + Kmeans cluster')
plt.xlabel('rp1')
plt.ylabel('rp2')

# Same scatter, coloured by the true outcome.
plt.figure(figsize=(7, 5))
for lbl in (0, 1):
    subset = rp_cluster[rp_cluster['label'] == lbl]
    sns.scatterplot(x=subset['rp1'], y=subset['rp2'], label='real {}'.format(lbl))
plt.title('RP + Kmeans real label')
plt.xlabel('rp1')
plt.ylabel('rp2')

cf_matrix = confusion_matrix(rp_cluster['label'], rp_cluster['cluster label'])
plot_CFmatrix(cf_matrix)

# --- RP + EM (2 components) on health data ---
gm = GaussianMixture(n_components=2)
gm.fit(rp_health)
pred = gm.predict(rp_health)
score = gm.score(rp_health)
aic = gm.aic(rp_health)
bic = gm.bic(rp_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

rp_cluster['em label'] = pred
rp_cluster.columns
plt.figure(figsize=(12, 12))
sns.pairplot(rp_cluster[['rp1', 'rp2', 'rp3', 'rp4', 'rp5', 'rp6', 'em label']],
             hue='em label')
cf_matrix = confusion_matrix(rp_cluster['label'], rp_cluster['em label'])
plot_CFmatrix(cf_matrix)
# --- Random-forest feature importance on health data ---
rfc = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rfc.fit(scaled_health, df_health_dn['outcome'])
feature_imp = rfc.feature_importances_
df_imp = pd.DataFrame(data=feature_imp, columns=['imp'])
df_imp['features'] = X_health.columns
# Sort features from most to least important.
df_imp = df_imp.sort_values(by=['imp'], ascending=False).reset_index(drop=True)
cum_imp = np.cumsum(df_imp['imp'])

plt.figure(figsize=(8, 5))
plt.plot(df_imp.index, df_imp['imp'], marker='o')
plt.plot(df_imp.index, cum_imp, marker='o')
plt.axhline(y=0.9, linestyle='--', color='r')  # 90% cumulative importance
plt.title('feature importance')
plt.xlabel('number of features')
plt.ylabel('feature importance')
# --- K-means on the 34 most important raw features (health) ---
sel_features = df_imp['features'][0:34]
rf_health = X_health[sel_features]
km = KMeans(n_clusters=2)
km.fit(rf_health)
pred = km.predict(rf_health)
score = km.score(rf_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
si_score = silhouette_score(rf_health, pred)
ca_score = calinski_harabasz_score(rf_health, pred)
print(-score, ho_score, si_score, ca_score)

# Pairwise scatter of six high-importance features, by cluster.
rf_cluster = rf_health.copy()
rf_cluster['cluster label'] = pred
rf_cluster['label'] = np.array(df_health_dn['outcome'])
plt.figure(figsize=(12, 12))
sns.pairplot(rf_cluster[['Anion gap', 'Lymphocyte', 'Blood calcium',
                         'Bicarbonate', 'Lactic acid', 'Platelets', 'cluster label']],
             hue='cluster label')
cf_matrix = confusion_matrix(rf_cluster['label'], rf_cluster['cluster label'])
plot_CFmatrix(cf_matrix)
# --- EM (2 components) on the selected raw features (health) ---
gm = GaussianMixture(n_components=2)
gm.fit(rf_health)
pred = gm.predict(rf_health)
score = gm.score(rf_health)
aic = gm.aic(rf_health)
bic = gm.bic(rf_health)
ho_score = homogeneity_score(df_health_dn['outcome'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

rf_cluster['em label'] = pred
rf_health.columns
plt.figure(figsize=(12, 12))
sns.pairplot(rf_cluster[['Anion gap', 'Lymphocyte', 'Blood calcium',
                         'Bicarbonate', 'Lactic acid', 'Platelets', 'em label']],
             hue='em label')
cf_matrix = confusion_matrix(rf_cluster['label'], rf_cluster['em label'])
plot_CFmatrix(cf_matrix)
# --- PCA on titanic data: explained variance across all 7 features ---
pca = PCA(n_components=7, random_state=3)
pca.fit(scaled_titanic)
varr_cum = np.cumsum(pca.explained_variance_ratio_)

components = np.arange(1, 8)
plt.figure(figsize=(12, 7))
plt.plot(components, varr_cum, marker='o', label='cumulative explained variance ratio')
plt.plot(components, pca.explained_variance_ratio_, marker='o',
         label='explained variance ratio')
plt.legend()
plt.xticks(components)
plt.axhline(y=0.9, linestyle='--', color='r')  # 90% variance threshold
plt.title('explained variance ratio -- titanic')
plt.ylabel('explained variance ratio')
plt.xlabel('number of components')

# Singular-value ("eigenvalue") spectrum.
plt.figure(figsize=(12, 8))
plt.bar(components, pca.singular_values_)
plt.xticks(components)
plt.title('Eigenvalues -- titanic')
plt.ylabel('Eigenvalue')
plt.xlabel('number of components')
# --- PCA (6 components) + K-means on titanic data ---
pca = PCA(n_components=6, random_state=3)
pca.fit(scaled_titanic)
pca_titanic = pca.transform(scaled_titanic)
km = KMeans(n_clusters=2)
km.fit(pca_titanic)
pred = km.predict(pca_titanic)
score = km.score(pca_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
si_score = silhouette_score(pca_titanic, pred)
ca_score = calinski_harabasz_score(pca_titanic, pred)
print(-score, ho_score, si_score, ca_score)

# Pairwise scatter of the components, coloured by cluster.
pca_cluster_t = pd.DataFrame(data=pca_titanic, columns=column_names(6, 'pc'))
pca_cluster_t['cluster label'] = pred
pca_cluster_t['label'] = df_titanic_f['Survived']
plt.figure(figsize=(12, 12))
sns.pairplot(pca_cluster_t.drop('label', axis=1), hue='cluster label')
plt.title('PCA + Kmeans pair plot')
cf_matrix = confusion_matrix(pca_cluster_t['label'], pca_cluster_t['cluster label'])
plot_CFmatrix(cf_matrix)
# --- PCA + EM (2 components) on titanic data ---
gm = GaussianMixture(n_components=2)
gm.fit(pca_titanic)
pred = gm.predict(pca_titanic)
score = gm.score(pca_titanic)
aic = gm.aic(pca_titanic)
bic = gm.bic(pca_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

# Pairwise scatter of the components, coloured by EM assignment.
pca_cluster_t['em label'] = pred
plt.figure(figsize=(12, 12))
sns.pairplot(pca_cluster_t.drop(['cluster label', 'label'], axis=1), hue='em label')
plt.title('PCA + EM pair plot')
cf_matrix = confusion_matrix(pca_cluster_t['label'], pca_cluster_t['em label'])
plot_CFmatrix(cf_matrix)
# --- ICA on titanic data: mean kurtosis of components vs component count ---
avg_kurs = []
for n_comp in range(1, 8):
    ica = ICA(n_components=n_comp)
    ica.fit(scaled_titanic)
    avg_kurs.append(np.mean(kurt(ica.components_)))

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, 8), avg_kurs, marker='o')
plt.title('ICA Kurtoses - titanic')
plt.xlabel('number of components')
plt.ylabel('Avg Kurtoses')
# --- ICA (7 components) + K-means on titanic data ---
ica = ICA(n_components=7, max_iter=1000)
ica.fit(scaled_titanic)
ica_titanic = ica.transform(scaled_titanic)
km = KMeans(n_clusters=2)
km.fit(ica_titanic)
pred = km.predict(ica_titanic)
score = km.score(ica_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
si_score = silhouette_score(ica_titanic, pred)
ca_score = calinski_harabasz_score(ica_titanic, pred)
print(-score, ho_score, si_score, ca_score)

# Pairwise scatter of the components, coloured by cluster.
ica_cluster_t = pd.DataFrame(data=ica_titanic, columns=column_names(7, 'ic'))
ica_cluster_t['cluster label'] = pred
ica_cluster_t['label'] = df_titanic_f['Survived']
plt.figure(figsize=(12, 12))
sns.pairplot(ica_cluster_t.drop('label', axis=1), hue='cluster label')
plt.title('ICA + Kmeans pair plot')
cf_matrix = confusion_matrix(ica_cluster_t['label'], ica_cluster_t['cluster label'])
plot_CFmatrix(cf_matrix)
# --- ICA + EM (2 components) on titanic data ---
gm = GaussianMixture(n_components=2)
gm.fit(ica_titanic)
pred = gm.predict(ica_titanic)
score = gm.score(ica_titanic)
aic = gm.aic(ica_titanic)
bic = gm.bic(ica_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

# Pairwise scatter of the components, coloured by EM assignment.
ica_cluster_t['em label'] = pred
plt.figure(figsize=(12, 12))
sns.pairplot(ica_cluster_t.drop(['cluster label', 'label'], axis=1), hue='em label')
plt.title('ICA + EM pair plot')
cf_matrix = confusion_matrix(ica_cluster_t['label'], ica_cluster_t['em label'])
plot_CFmatrix(cf_matrix)
# --- RP reconstruction error on titanic data, averaged over 10 runs ---
rec_error_comp = []
for n_comp in range(1, 8):
    # BUG FIX: the list was misspelled `rec_erros`, so the append below
    # raised NameError on the first iteration.
    rec_errors = []
    for _ in range(10):
        rec_errors.append(compute_recon(n_comp, scaled_titanic))
    # NOTE(review): compute_recon uses a fixed random_state, so these 10
    # repetitions are identical; vary the seed for a meaningful average.
    rec_error_comp.append(np.mean(rec_errors))

plt.figure(figsize=(8, 5))
plt.plot(np.arange(1, 8), rec_error_comp, marker='o')
plt.title('RP reconstruction error - titanic')
plt.xlabel('number of components')
plt.ylabel('reconstruction error')
# --- RP (5 components) + K-means on titanic data ---
rp = GRP(n_components=5, random_state=3)
rp.fit(scaled_titanic)
rp_titanic = rp.transform(scaled_titanic)
km = KMeans(n_clusters=2)
km.fit(rp_titanic)
pred = km.predict(rp_titanic)
score = km.score(rp_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
si_score = silhouette_score(rp_titanic, pred)
ca_score = calinski_harabasz_score(rp_titanic, pred)
print(-score, ho_score, si_score, ca_score)

# Pairwise scatter of the projected dimensions, coloured by cluster.
rp_cluster_t = pd.DataFrame(data=rp_titanic, columns=column_names(5, 'rp'))
rp_cluster_t['cluster label'] = pred
rp_cluster_t['label'] = df_titanic_f['Survived']
plt.figure(figsize=(12, 12))
sns.pairplot(rp_cluster_t.drop('label', axis=1), hue='cluster label')
plt.title('RP + Kmeans pair plot')
cf_matrix = confusion_matrix(rp_cluster_t['label'], rp_cluster_t['cluster label'])
plot_CFmatrix(cf_matrix)
# --- RP + EM (2 components) on titanic data ---
gm = GaussianMixture(n_components=2)
gm.fit(rp_titanic)
pred = gm.predict(rp_titanic)
score = gm.score(rp_titanic)
aic = gm.aic(rp_titanic)
bic = gm.bic(rp_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

# Pairwise scatter of the projected dimensions, coloured by EM assignment.
rp_cluster_t['em label'] = pred
plt.figure(figsize=(12, 12))
sns.pairplot(rp_cluster_t.drop(['cluster label', 'label'], axis=1), hue='em label')
plt.title('RP + EM pair plot')
cf_matrix = confusion_matrix(rp_cluster_t['label'], rp_cluster_t['em label'])
plot_CFmatrix(cf_matrix)
# --- Random-forest feature importance on titanic data ---
rfc = RandomForestClassifier(n_estimators=200, n_jobs=-1)
rfc.fit(scaled_titanic, df_titanic_f['Survived'])
feature_imp = rfc.feature_importances_
df_imp = pd.DataFrame(data=feature_imp, columns=['imp'])
df_imp['features'] = X_titanic.columns
# Sort features from most to least important.
df_imp = df_imp.sort_values(by=['imp'], ascending=False).reset_index(drop=True)
cum_imp = np.cumsum(df_imp['imp'])

plt.figure(figsize=(8, 5))
plt.plot(df_imp.index, df_imp['imp'], marker='o')
plt.plot(df_imp.index, cum_imp, marker='o')
plt.axhline(y=0.9, linestyle='--', color='r')  # 90% cumulative importance
plt.title('feature importance')
plt.xlabel('number of features')
plt.ylabel('feature importance')
# --- K-means on the 4 most important raw features (titanic) ---
sel_features = df_imp['features'][0:4]
rf_titanic = X_titanic[sel_features]
km = KMeans(n_clusters=2)
km.fit(rf_titanic)
pred = km.predict(rf_titanic)
score = km.score(rf_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
si_score = silhouette_score(rf_titanic, pred)
ca_score = calinski_harabasz_score(rf_titanic, pred)
print(-score, ho_score, si_score, ca_score)

# Pairwise scatter of the selected features, coloured by cluster.
rf_cluster_t = rf_titanic.copy()
rf_cluster_t['cluster label'] = pred
rf_cluster_t['label'] = np.array(df_titanic_f['Survived'])
plt.figure(figsize=(12, 12))
sns.pairplot(rf_cluster_t.drop('label', axis=1), hue='cluster label')
plt.title('RF + Kmeans pair plot')
cf_matrix = confusion_matrix(rf_cluster_t['label'], rf_cluster_t['cluster label'])
plot_CFmatrix(cf_matrix)
# --- EM (2 components) on the selected raw features (titanic) ---
gm = GaussianMixture(n_components=2)
gm.fit(rf_titanic)
pred = gm.predict(rf_titanic)
score = gm.score(rf_titanic)
aic = gm.aic(rf_titanic)
bic = gm.bic(rf_titanic)
ho_score = homogeneity_score(df_titanic_f['Survived'], pred)
print('score: {}'.format(score))
print('AIC: {}'.format(aic))
print('BIC: {}'.format(bic))
print('homogeneity: {}'.format(ho_score))

rf_cluster_t['em label'] = pred
plt.figure(figsize=(12, 12))
sns.pairplot(rf_cluster_t.drop(['cluster label', 'label'], axis=1), hue='em label')
plt.title('RF + EM pair plot')
# BUG FIX: the confusion matrix was computed from `rp_cluster_t` (the random-
# projection frame), so it silently displayed the RP+EM result here instead
# of the RF+EM result.
cf_matrix = confusion_matrix(rf_cluster_t['label'], rf_cluster_t['em label'])
plot_CFmatrix(cf_matrix)
# learning curve
def learning_curve(clf, X, y):
    """Plot f1 learning curves and fit/score times for `clf` over 20 growing
    training-set sizes (5% to 100% of the data), then return the series.

    Returns (train_scores, val_scores, train_times, val_times), each a list
    of cross-validated means, one entry per training size.
    """
    train_sizes = np.linspace(0.05 * len(y), len(y), 20, dtype='int')
    train_scores, val_scores = [], []
    train_times, val_times = [], []
    for size in train_sizes:
        cv_res = cross_validate(clf, X=X.iloc[0:size, :], y=y.iloc[0:size],
                                scoring='f1', return_train_score=True)
        train_scores.append(np.mean(cv_res['train_score']))
        val_scores.append(np.mean(cv_res['test_score']))
        train_times.append(np.mean(cv_res['fit_time']))
        val_times.append(np.mean(cv_res['score_time']))

    # Learning curve: train vs validation f1.
    plt.figure(figsize=(8, 5))
    plt.plot(train_sizes, train_scores, marker='o', label='train')
    plt.plot(train_sizes, val_scores, marker='o', label='validation')
    plt.legend()
    plt.title('learning curve')
    plt.xlabel('training examples')
    plt.ylabel('f1 score')
    plt.show()

    # Timing curve: fit vs score time.
    plt.figure(figsize=(8, 5))
    plt.plot(train_sizes, train_times, marker='o', label='train_time')
    plt.plot(train_sizes, val_times, marker='o', label='validation_time')
    plt.legend()
    plt.title('Modeling time')
    plt.xlabel('training examples')
    plt.ylabel('fitting time(s)')
    plt.show()
    return train_scores, val_scores, train_times, val_times
# --- Neural-net baselines on the health data: raw (scaled) vs. each DR output ---
def _nn_curve_on(features):
    """Split `features` against the health outcome (70/30), fit a fresh
    10-unit MLP learning curve, and return (split, clf, curves) so the
    caller can unpack into the original module-level names."""
    split = train_test_split(pd.DataFrame(features),
                             df_health_dn['outcome'],
                             test_size=0.30)
    clf = MLPClassifier(hidden_layer_sizes=(10,), solver='adam', max_iter=2000)
    # split = [X_train, X_test, y_train, y_test]
    curves = learning_curve(clf, split[0], split[2])
    return split, clf, curves

# Baseline: standardized features.
(X_train, X_test, y_train, y_test), nn, (nn_ts, nn_vs, nn_tt, nn_vt) = \
    _nn_curve_on(scaled_health)
# PCA projection.
(X_train_pca, X_test_pca, y_train, y_test), nn, (nn_ts_pca, nn_vs_pca, nn_tt_pca, nn_vt_pca) = \
    _nn_curve_on(pca_health)
# ICA projection.
(X_train_ica, X_test_ica, y_train, y_test), nn, (nn_ts_ica, nn_vs_ica, nn_tt_ica, nn_vt_ica) = \
    _nn_curve_on(ica_health)
# Random projection.
(X_train_rp, X_test_rp, y_train, y_test), nn, (nn_ts_rp, nn_vs_rp, nn_tt_rp, nn_vt_rp) = \
    _nn_curve_on(rp_health)
# RF-selected features.
(X_train_rf, X_test_rf, y_train, y_test), nn, (nn_ts_rf, nn_vs_rf, nn_tt_rf, nn_vt_rf) = \
    _nn_curve_on(rf_health)
# Recreate the x-axis used inside learning_curve: 20 sizes, 5%..100% of y_train.
train_sizes = np.linspace(len(y_train) * 0.05, len(y_train), 20, dtype='int')

# Validation-score comparison across representations.
fig = plt.figure(figsize=(8, 5))
for curve, tag in [(nn_vs, 'neural nets'),
                   (nn_vs_pca, 'pca+nn'),
                   (nn_vs_ica, 'ica+nn'),
                   (nn_vs_rp, 'rp+nn'),
                   (nn_vs_rf, 'rf+nn')]:
    plt.plot(train_sizes, curve, marker='o', label=tag)
plt.legend()
plt.title('Validation learning curve comparison')
plt.xlabel('training examples')
plt.ylabel('f1 score')

# Fit-time comparison across representations.
fig11 = plt.figure(figsize=(8, 5))
for curve, tag in [(nn_tt, 'neural nets'),
                   (nn_tt_pca, 'pca+nn'),
                   (nn_tt_ica, 'ica+nn'),
                   (nn_tt_rp, 'rp+nn'),
                   (nn_tt_rf, 'rf+nn')]:
    plt.plot(train_sizes, curve, marker='o', label=tag)
plt.legend()
plt.title('Training time Comparison')
plt.xlabel('training examples')
plt.ylabel('training time(s)')
# --- NN on DR + cluster-label feature sets (health data) ---
def _nn_curve_on_clustered(cluster_df, drop_cols):
    """Drop `drop_cols` from the clustered feature frame, split it against the
    health outcome (70/30), fit a fresh 10-unit MLP learning curve, and return
    (split, clf, curves) for unpacking into the original module-level names.

    Keeping ['label', 'em label'] out retains the K-means column; keeping
    ['label', 'cluster label'] out retains the EM column.
    """
    features = cluster_df.drop(drop_cols, axis=1)
    split = train_test_split(features, df_health_dn['outcome'], test_size=0.30)
    clf = MLPClassifier(hidden_layer_sizes=(10,), solver='adam', max_iter=2000)
    # split = [X_train, X_test, y_train, y_test]
    curves = learning_curve(clf, split[0], split[2])
    return split, clf, curves

# PCA + K-means / EM.
(X_train_pcak, X_test_pcak, y_train, y_test), nn, (nn_ts_pcak, nn_vs_pcak, nn_tt_pcak, nn_vt_pcak) = \
    _nn_curve_on_clustered(pca_cluster, ['label', 'em label'])
(X_train_pcae, X_test_pcae, y_train, y_test), nn, (nn_ts_pcae, nn_vs_pcae, nn_tt_pcae, nn_vt_pcae) = \
    _nn_curve_on_clustered(pca_cluster, ['label', 'cluster label'])
# ICA + K-means / EM.
(X_train_icak, X_test_icak, y_train, y_test), nn, (nn_ts_icak, nn_vs_icak, nn_tt_icak, nn_vt_icak) = \
    _nn_curve_on_clustered(ica_cluster, ['label', 'em label'])
(X_train_icae, X_test_icae, y_train, y_test), nn, (nn_ts_icae, nn_vs_icae, nn_tt_icae, nn_vt_icae) = \
    _nn_curve_on_clustered(ica_cluster, ['label', 'cluster label'])
# RP + K-means / EM.
(X_train_rpk, X_test_rpk, y_train, y_test), nn, (nn_ts_rpk, nn_vs_rpk, nn_tt_rpk, nn_vt_rpk) = \
    _nn_curve_on_clustered(rp_cluster, ['label', 'em label'])
(X_train_rpe, X_test_rpe, y_train, y_test), nn, (nn_ts_rpe, nn_vs_rpe, nn_tt_rpe, nn_vt_rpe) = \
    _nn_curve_on_clustered(rp_cluster, ['label', 'cluster label'])
# RF + K-means / EM.
(X_train_rfk, X_test_rfk, y_train, y_test), nn, (nn_ts_rfk, nn_vs_rfk, nn_tt_rfk, nn_vt_rfk) = \
    _nn_curve_on_clustered(rf_cluster, ['label', 'em label'])
(X_train_rfe, X_test_rfe, y_train, y_test), nn, (nn_ts_rfe, nn_vs_rfe, nn_tt_rfe, nn_vt_rfe) = \
    _nn_curve_on_clustered(rf_cluster, ['label', 'cluster label'])
# Same x-axis as the per-model curves: 20 sizes, 5%..100% of y_train.
train_sizes = np.linspace(len(y_train) * 0.05, len(y_train), 20, dtype='int')

# Validation-score comparison: baseline NN vs. every DR + clustering combo.
fig = plt.figure(figsize=(8, 5))
for curve, tag in [(nn_vs, 'neural nets'),
                   (nn_vs_pcak, 'pca+kmeans+nn'),
                   (nn_vs_pcae, 'pca+EM+nn'),
                   (nn_vs_icak, 'ica+Kmeans+nn'),
                   (nn_vs_icae, 'ica+EM+nn'),
                   (nn_vs_rpk, 'rp+Kmeans+nn'),
                   (nn_vs_rpe, 'rp+EM+nn'),
                   (nn_vs_rfk, 'rf+Kmeans+nn'),
                   (nn_vs_rfe, 'rf+EM+nn')]:
    plt.plot(train_sizes, curve, marker='o', label=tag)
plt.legend()
plt.title('Validation learning curve comparison')
plt.xlabel('training examples')
plt.ylabel('f1 score')

# Fit-time comparison for the same set of models.
fig = plt.figure(figsize=(8, 5))
for curve, tag in [(nn_tt, 'neural nets'),
                   (nn_tt_pcak, 'pca+kmeans+nn'),
                   (nn_tt_pcae, 'pca+EM+nn'),
                   (nn_tt_icak, 'ica+Kmeans+nn'),
                   (nn_tt_icae, 'ica+EM+nn'),
                   (nn_tt_rpk, 'rp+Kmeans+nn'),
                   (nn_tt_rpe, 'rp+EM+nn'),
                   (nn_tt_rfk, 'rf+Kmeans+nn'),
                   (nn_tt_rfe, 'rf+EM+nn')]:
    plt.plot(train_sizes, curve, marker='o', label=tag)
plt.legend()
plt.title('Training time Comparison')
plt.xlabel('training examples')
plt.ylabel('training time(s)')